graph TB
    %% Shape flow of a two-branch model (ViT + ResNet50) whose features are
    %% fused, passed through self-attention, and then decoded.
    %% Shapes are copied from the node labels; "bs" presumably abbreviates
    %% the batch_size shown on the input node - confirm against the model code.

    %% ---- Nodes (declared in order of first use) ----
    A["Input Image (batch_size, 3, H, W)"]
    B[ViT Branch]
    C[ResNet50 Branch]
    D["ViT Output: (bs, 768)"]
    E["Unsqueeze: (bs, 768, 1, 1)"]
    F["ResNet Output: (bs, 2048, H', W')"]
    G[Feature Fusion]
    H["Fused: (bs, 2816, H', W')"]
    I[Self-Attention]
    J["Output: (bs, 2816, H', W')"]
    K[Decoder]
    L["Final: (bs, num_classes)"]

    %% ---- Edges ----
    %% The same input image feeds both branches.
    A --> B
    A --> C

    %% ViT path: a per-image vector gains two trailing singleton dims.
    B --> D
    D --> E

    %% ResNet path: a spatial feature map.
    C --> F

    %% Fusion: 2816 equals 768 + 2048, which suggests channel-wise concatenation.
    %% NOTE(review): the 1x1 ViT map would have to be broadcast to the ResNet
    %% spatial size first; that step is not drawn here - confirm in the model code.
    E --> G
    F --> G
    G --> H

    %% Self-attention keeps the fused shape unchanged.
    H --> I
    I --> J

    %% The decoder maps the attended features to per-class outputs.
    J --> K
    K --> L